PART 0: Data preprocessing

Ziyu Tao, Shixiang Wang, Chenxu Wu, Huimin Li, Tao Wu, Xiangyu Zhao, Wei Ning, Guangshuai Wang, Xue-Song Liu (Corresponding author)

2022-01-27

In this part, raw data are collected from databases or papers and the core pre-processing steps are described in sections below.

The pre-processing work has been done by setting root path of the project repository as work directory. Therefore, keep in mind the work directory should be properly set if you are interested in reproducing the pre-processing procedure.

Prepare PCAWG datasets

We download PCAWG phenotype and copy number data from UCSC Xena and save them to local machine with R format.

dir.create("Xena")

# Phenotype (Specimen Centric) -----------------------------------------------

library(UCSCXenaTools)
pcawg_phenotype <- XenaData %>%
  dplyr::filter(XenaHostNames == "pcawgHub") %>%
  XenaScan("phenotype") %>%
  XenaScan("specimen centric") %>%
  XenaGenerate() %>%
  XenaQuery()
pcawg_phenotype <- pcawg_phenotype %>%
  XenaDownload(destdir = "Xena", trans_slash = TRUE)

phenotype_list <- XenaPrepare(pcawg_phenotype)
pcawg_samp_info_sp <- phenotype_list[1:4]

saveRDS(pcawg_samp_info_sp, file = "data/pcawg_samp_info_sp.rds")

# PCAWG (Specimen Centric) ------------------------------------------------

download.file(
  "https://pcawg.xenahubs.net/download/20170119_final_consensus_copynumber_sp.gz",
  "Xena/pcawg_copynumber_sp.gz"
)

pcawg_cn <- data.table::fread("Xena/pcawg_copynumber_sp.gz")
pcawg_samp_info_sp <- readRDS("data/pcawg_samp_info_sp.rds")

sex_dt <- pcawg_samp_info_sp$pcawg_donor_clinical_August2016_v9_sp %>%
  dplyr::select(xena_sample, donor_sex) %>%
  purrr::set_names(c("sample", "sex")) %>%
  data.table::as.data.table()

saveRDS(sex_dt, file = "data/pcawg_sex_sp.rds")

pcawg_cn <- pcawg_cn[!is.na(total_cn)]
pcawg_cn$value <- NULL
pcawg_cn <- pcawg_cn[, c(1:5, 7)]
colnames(pcawg_cn)[1:5] <- c("sample", "Chromosome", "Start.bp", "End.bp", "modal_cn")

saveRDS(pcawg_cn, file = "data/pcawg_copynumber_sp.rds")

# LOH ---------------------------------------------------------------------

pcawg_cn <- readRDS("data/pcawg_copynumber_sp.rds")

pcawg_loh <- pcawg_cn %>%
  dplyr::filter(Chromosome %in% as.character(1:22)) %>%
  dplyr::mutate(len = End.bp - Start.bp + 1) %>%
  dplyr::group_by(sample) %>%
  dplyr::summarise(
    n_LOH = sum(minor_cn == 0 & modal_cn > 0 & len >= 1e4)
  ) %>%
  setNames(c("sample", "n_LOH")) %>%
  data.table::as.data.table()

saveRDS(pcawg_loh, file = "data/pcawg_loh.rds")

Generate PCAWG sample-by-component matrix

We then read the copy number data and transform it into a CopyNumber object with R package sigminer. Then sig_tally() is used to generate the matrix for NMF decomposition.

library(sigminer)

# Only focus autosomes, suggested by Prof. Liu
pcawg_cn <- readRDS("data/pcawg_copynumber_sp.rds")
table(pcawg_cn$Chromosome)
pcawg_cn <- pcawg_cn[!Chromosome %in% c("X", "Y")]
cn_obj <- read_copynumber(pcawg_cn,
  add_loh = TRUE,
  loh_min_len = 1e4,
  loh_min_frac = 0.05,
  max_copynumber = 1000L,
  genome_build = "hg19",
  complement = FALSE,
  genome_measure = "called"
)
saveRDS(cn_obj, file = "data/pcawg_cn_obj.rds")

# Tally -------------------------------------------------------------------
library(sigminer)
cn_obj <- readRDS("data/pcawg_cn_obj.rds")
table(cn_obj@data$chromosome)

tally_X <- sig_tally(cn_obj,
  method = "X",
  add_loh = TRUE,
  cores = 10
)
saveRDS(tally_X, file = "data/pcawg_cn_tally_X.rds")
str(tally_X$all_matrices, max.level = 1)

p <- show_catalogue(tally_X,
  mode = "copynumber", method = "X",
  style = "cosmic", y_tr = function(x) log10(x + 1),
  y_lab = "log10(count +1)"
)
p
ggplot2::ggsave("output/pcawg_catalogs_tally_X.pdf", plot = p, width = 16, height = 2.5)

head(sort(colSums(tally_X$nmf_matrix)), n = 50)

## classes without LOH labels
tally_X_noLOH <- sig_tally(cn_obj,
  method = "X",
  add_loh = FALSE,
  cores = 10
)
saveRDS(tally_X_noLOH$all_matrices, file = "data/pcawg_cn_tally_X_noLOH.rds")
str(tally_X_noLOH$all_matrices, max.level = 1)

p <- show_catalogue(tally_X_noLOH,
  mode = "copynumber", method = "X",
  style = "cosmic", y_tr = function(x) log10(x + 1),
  y_lab = "log10(count +1)"
)
p
ggplot2::ggsave("output/pcawg_catalogs_tally_X_noLOH.pdf", plot = p, width = 16, height = 2.5)

The generated catalog profiles for LOH version and non-LOH version component classification can be viewed by the following link:

Prepare TCGA datasets

TCGA allele-specific copy number data are downloaded from GDC portal and transformed into R format by Huimin Li.

The data is go further checked and cleaned by Shixiang.

# Huimin collected TCGA data from GDC portal
# and generate file with format needed by sigminer
# Here I will firstly further clean up the data

library(data.table)
x <- readRDS("data/TCGA/datamicopy.rds")
setDT(x)

colnames(x)[6] <- "minor_cn"
head(x)
x[, sample := substr(sample, 1, 15)]
saveRDS(x, file = "data/TCGA/tcga_cn.rds")

rm(list = ls())

TCGA phenotype data is downloaded from UCSC Xena.

download.file(
  "https://pancanatlas.xenahubs.net/download/Survival_SupplementalTable_S1_20171025_xena_sp.gz",
  "Xena/Survival_SupplementalTable_S1_20171025_xena_sp.gz"
)

tcga_cli <- data.table::fread("Xena/Survival_SupplementalTable_S1_20171025_xena_sp.gz")
table(tcga_cli$`cancer type abbreviation`)
saveRDS(tcga_cli, file = "data/TCGA/tcga_cli.rds")

Generate TCGA sample-by-component matrix

Similar to PCAWG, TCGA matrices are also generated.

library(sigminer)

# Only focus autosomes
tcga_cn <- readRDS("data/TCGA/tcga_cn.rds")
table(tcga_cn$chromosome)
tcga_cn <- tcga_cn[!chromosome %in% c("chrX", "chrY")]
cn_obj <- read_copynumber(
  tcga_cn,
  seg_cols = c("chromosome", "start", "end", "segVal"),
  add_loh = TRUE,
  loh_min_len = 1e4,
  loh_min_frac = 0.05,
  max_copynumber = 1000L,
  genome_build = "hg38",
  complement = FALSE,
  genome_measure = "called"
)
saveRDS(cn_obj, file = "data/TCGA/tcga_cn_obj.rds")

# Tally step
# Generate the counting matrices
# Same as what have done for PCAWG dataset
library(sigminer)
cn_obj <- readRDS("data/tcga_cn_obj.rds")
table(cn_obj@data$chromosome)

tally_X <- sig_tally(cn_obj,
  method = "X",
  add_loh = TRUE,
  cores = 10
)
saveRDS(tally_X, file = "data/TCGA/tcga_cn_tally_X.rds")


p <- show_catalogue(tally_X,
  mode = "copynumber", method = "X",
  style = "cosmic", y_tr = function(x) log10(x + 1),
  y_lab = "log10(count +1)"
)
p
ggplot2::ggsave("output/tcga_catalogs_tally_X.pdf", plot = p, width = 16, height = 2.5)

head(sort(colSums(tally_X$nmf_matrix)), n = 50)

## classes without LOH labels
tally_X_noLOH <- sig_tally(cn_obj,
  method = "X",
  add_loh = FALSE,
  cores = 10
)
saveRDS(tally_X_noLOH$all_matrices, file = "data/TCGA/tcga_cn_tally_X_noLOH.rds")
str(tally_X_noLOH$all_matrices, max.level = 1)

p <- show_catalogue(tally_X_noLOH,
  mode = "copynumber", method = "X",
  style = "cosmic", y_tr = function(x) log10(x + 1),
  y_lab = "log10(count +1)"
)
p
ggplot2::ggsave("output/tcga_catalogs_tally_X_noLOH.pdf", plot = p, width = 16, height = 2.5)

The generated catalog profiles for LOH version and non-LOH version component classification can be viewed by the following link:

Distribution of CNA segment features in PCAWG cancer types

pcawg_cn2 <- readRDS("../data/pcawg_cn_obj.rds") %>%
  .@data
pcawg_types <- readRDS("../data/pcawg_type_info.rds")
pcawg_cn2 <- dplyr::left_join(
  pcawg_cn2,
  pcawg_types,
  by = "sample"
)
pcawg_cn2$length <- pcawg_cn2$end - pcawg_cn2$start
pcawg_cn2$length2 <- log10((pcawg_cn2$length) / 5)

library(ggpubr)
library(ggsci)
# distribution of segment size
p_seg_size <- ggpubr::ggdensity(pcawg_cn2,
  x = "length2",
  facet.by = "cancer_type",
  fill = "cancer_type",
  alpha = 0.8
) +
  scale_fill_igv() +
  scale_color_igv() +
  theme(
    legend.position = "none",
    axis.title.x = element_blank()
  ) +
  scale_x_continuous(
    name = "length2",
    limits = c(3, 7),
    breaks = c(4, 5, 6),
    labels = c("50Kb", "500Kb", "5Mb")
  ) +
  geom_vline(xintercept = c(4, 5, 6), linetype = "dashed")
p_seg_size

# distribution of copy number
high_cancer <- c(
  "Breast", "Liver-HCC", "Ovary-AdenoCA",
  "Panc-AdenoCA", "Prost-AdenoCA", "Eso-AdenoCA"
)

pcawg_cn2$cancer_type <- factor(
  pcawg_cn2$cancer_type,
  levels = c(
    high_cancer,
    setdiff(names(table(pcawg_cn2$cancer_type)), high_cancer)
  )
)
p_cn_number <- ggpubr::ggdensity(pcawg_cn2,
  x = "segVal",
  y = "..count..",
  facet.by = "cancer_type",
  scales = "free_y",
  fill = "cancer_type",
  alpha = 0.8,
) +
  scale_fill_igv() +
  scale_color_igv() +
  theme(
    legend.position = "none",
    axis.title.x = element_blank()
  ) +
  scale_x_continuous(
    name = "segVal",
    limits = c(0, 32),
    breaks = c(0, 2, 16, 32),
    labels = c("0", "2", "16", "32")
  ) +
  geom_vline(xintercept = c(2, 16, 32), linetype = "dashed")
p_cn_number

PCAWG genotype/phenotype data type and source

If no references described, PCAWG genotype/phenotype data used in this study are obtained from UCSC Xena database, the original data source is PCAWG paper collection published in early 2020.

Kim, Hoon, Nam-Phuong Nguyen, Kristen Turner, Sihan Wu, Amit D. Gujar, Jens Luebeck, Jihe Liu, et al. 2020. “Extrachromosomal DNA Is Associated with Oncogene Amplification and Poor Outcome Across Multiple Cancers.” Nature Genetics 52 (9): 891–97. https://doi.org/10.1038/s41588-020-0678-2.
Nguyen, Luan, John Martens, Arne Van Hoeck, and Edwin Cuppen. 2020. “Pan-Cancer Landscape of Homologous Recombination Deficiency.” Preprint. Cancer Biology. https://doi.org/10.1101/2020.01.13.905026.
PCAWG-Structural Variation Working Group, PCAWG Consortium, Lina Sieverling, Chen Hong, Sandra D. Koser, Philip Ginsbach, Kortine Kleinheinz, et al. 2020. “Genomic Footprints of Activated Telomere Maintenance Mechanisms in Cancer.” Nature Communications 11 (1): 733. https://doi.org/10.1038/s41467-019-13824-9.